The data consist of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions across multiple temporal windows.

Dependencies

User defined parameters

# Show the parameters this report was rendered with (supplied via the
# YAML `params` list of the Rmd); the echoed values below record the
# settings used for this particular knit.
print(params)
## $test_run
## [1] FALSE
## 
## $save_figs
## [1] FALSE
## 
## $response
## [1] "HerbCover_dec"
## 
## $hmod
## [1] FALSE
## 
## $s
## [1] "HerbCover"
## 
## $sample_group
## [1] 1
## 
## $byRegion
## [1] TRUE
# Unpack the report parameters into plain variables used throughout the
# script, then source the project helper functions.

# set to TRUE if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
hmod <- params$hmod # whether to include human modification in the model
# by changing the sample_group the model can be fit to a completely different set of rows
sample_group <- params$sample_group 
response <- params$response
# _ann defines this model as being built on annual data
s <- params$s # string to paste to file names e.g., that defines the interactions in the model
# such as (summer ppt * MAT) and annuals*temperature interactions
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
byRegion <- params$byRegion
# NOTE(review): the comment below the original code claimed an option was set
# so resampling reproduces earlier runs with dplyr 1.0.10, but no options()
# call follows -- confirm whether that line was lost in an edit.
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")

library(terra)
library(tidyterra)
library(sf)
library(caret)
library(betareg)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())

read in data

Data compiled in the prepDataForModels.R script

# Full modeling dataset (an sf object; see st_coordinates use below),
# built by the prepDataForModels.R script.
modDat <- readRDS("../../../data/DataForModels_withSubEcoreg.rds")

Prep data

set.seed(1234)

# Extract point coordinates once (the original called st_coordinates(.)
# separately for Lon and Lat, recomputing the full coordinate matrix twice).
xy <- st_coordinates(modDat)

# Add Lon/Lat columns, drop the sf geometry, and keep only rows that were
# assigned to one of the lumped ecoregions.
modDat_1 <- modDat %>% 
  mutate(Lon = xy[, 1], 
         Lat = xy[, 2]) %>% 
  st_drop_geometry() %>% 
  filter(!is.na(newRegion))

# small dataset for if testing the code
if(test_run) {
  modDat_1 <- slice_sample(modDat_1, n = 1e5)
}

For now, not doing any resampling

set.seed(1234)

# Predictor variables used in all models.
# VPD removed, since it's highly correlated w/ tmean and prcp.
pred_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr",
               "prcp_meanAnnTotal_30yr", "PrecipTempCorr_meanAnnAvg_30yr",
               "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")

names(pred_vars) <- pred_vars

# predictor vars are the same in both dfs

## remove rows for data that have NAs in the predictors (lag starts before range of DayMet data)
## fix: the original listed the predictors by hand and omitted
## annWetDegDays_meanAnnAvg_30yr, so NA rows for that predictor survived;
## all_of(pred_vars) now covers every predictor. precip_Seasonality is kept
## in the list to preserve the original row filtering.
modDat_1 <- modDat_1 %>% 
  drop_na(all_of(pred_vars), precip_Seasonality_meanAnnAvg_30yr, 
          all_of(response))
  

# predictor-only data frame (used for pairs plots etc.)
df_pred <- modDat_1[, pred_vars]

Break the data into three groups for model fitting based on lumped ecoregions

if (byRegion == TRUE) {
  # One data frame per lumped ecoregion; newRegion NAs were already dropped
  # when modDat_1 was built.
  modDat_westForest <- filter(modDat_1, newRegion == "westForest")
  modDat_eastForest <- filter(modDat_1, newRegion == "eastForest")
  modDat_shrubGrass <- filter(modDat_1, newRegion == "dryShrubGrass")
}

Training data

# Helper: draw the training sample for one data frame. `sample_group`
# selects which block of n_train rows (of a randomly reordered copy) is
# used, so different sample_groups give (near) disjoint training sets.
# Fixes vs original: n_train is shrunk locally (the original modified the
# global n_train inside this chunk, silently shrinking the training sets of
# every region processed afterwards), the warning message is legible, and
# the row range is capped at nrow(df) (the original's single 20% reduction
# could still index past the end of the data).
draw_train_sample <- function(df) {
  if (!(fit_sample && !test_run)) {
    return(df)
  }
  reordered <- slice_sample(df, prop = 1) # random reordering of all rows
  n <- n_train
  low <- (sample_group - 1) * n + 1 # first row (of reordered data) to get
  high <- low + n - 1               # last row
  if (high > nrow(df)) {
    warning(paste0("n_train reduced due to low sample size (i.e. < ", n, ")"))
    n <- n * 0.8
    low <- (sample_group - 1) * n + 1
    high <- min(low + n - 1, nrow(df))
  }
  reordered[low:high, ]
}

# Helper: draw the test sample, excluding training rows when possible.
# anti_join only works if there are enough rows that don't belong to the
# training sample (i.e. if the training sample contains most of the data it
# doesn't work), so otherwise fall back to sampling from the full data.
draw_test_sample <- function(df, train) {
  if (fit_sample && !test_run && (nrow(df) - nrow(train) > n_test)) {
    df %>% 
      anti_join(train, by = c("cell_num", "year")) %>% 
      slice_sample(n = n_test)
  } else {
    df %>% 
      slice_sample(n = n_test)
  }
}

if (byRegion == TRUE) {
  ## for western forests
  wf_sample <- draw_train_sample(modDat_westForest)
  wf_test <- draw_test_sample(modDat_westForest, wf_sample)
  wf_small <- slice_sample(modDat_westForest, n = 1e5) # small sample for certain plots

  ## for eastern forests
  ef_sample <- draw_train_sample(modDat_eastForest)
  ef_test <- draw_test_sample(modDat_eastForest, ef_sample)
  ef_small <- slice_sample(modDat_eastForest, n = 1e5)

  ## for grass/shrubs
  g_sample <- draw_train_sample(modDat_shrubGrass)
  g_test <- draw_test_sample(modDat_shrubGrass, g_sample)
  g_small <- slice_sample(modDat_shrubGrass, n = 1e5)

  ## do full dataset as well
  df_sample <- draw_train_sample(modDat_1)
  df_test <- draw_test_sample(modDat_1, df_sample)
  df_small <- slice_sample(modDat_1, n = 1e5)

} else if (byRegion == FALSE) {
  df_sample <- draw_train_sample(modDat_1)
  df_test <- draw_test_sample(modDat_1, df_sample)
  df_small <- slice_sample(modDat_1, n = 1e5)
}

Exploratory figs & summary values

# Summarise every column of a numeric data frame: one row per variable with
# its mean, min, median, and max (NAs removed), rounded to 4 decimal places.
create_summary <- function(df) {
  df %>% 
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>% 
    group_by(variable) %>% 
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), 
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>% 
    # fix: passing `round, 4` through across(...) is deprecated in dplyr >= 1.1;
    # use an explicit lambda instead
    mutate(across(where(is.numeric), ~ round(.x, 4)))
}

# Table of summary statistics for each predictor variable.
knitr::kable(create_summary(modDat_1[pred_vars]),
             caption = 'summaries of predictor variables')
summaries of predictor variables
variable value_mean value_min value_median value_max
PrecipTempCorr_meanAnnAvg_30yr -0.2422 -0.8556 -0.2809 0.7145
annWetDegDays_meanAnnAvg_30yr 1590.6907 229.5051 1589.2014 2713.2516
isothermality_meanAnnAvg_30yr -37.4865 -60.9401 -36.9568 -20.2171
prcp_meanAnnTotal_30yr 740.7390 52.1693 483.0162 4116.7860
swe_meanAnnAvg_30yr 17.8939 0.0000 3.6433 1435.7015
tmean_meanAnnAvg_30yr 6.7665 2.2173 6.9464 10.3473
# Summary statistics for the response variable only (columns matching the
# `response` parameter string).
response_summary <- create_summary(dplyr::select(modDat_1, matches(response)))


kable(response_summary, 
      caption = 'summaries of response variables, calculated using paint')
summaries of response variables, calculated using paint
variable value_mean value_min value_median value_max
HerbCover_dec 0.1601 1e-04 0.055 0.999

Plot predictor vars against each other

Here we use the predictor-only dataframe (df_pred), because it is smaller and this code is slow.

# Pairs plot of the predictors for a 50k random subset (ggpairs on the full
# data is slow).
ggpairs(slice_sample(df_pred, n = 5e4),
        lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.2)), 
        columnLabels = c("swe", "tmean", "prcp", "prcpTempCorr", "isothermality", "wetDegDays"))

Boxplots — predictor variables compared to binned response variables

# vectors of names of response variables (a single response in this report)
vars_response <- response

# longformat dataframes for making boxplots: the response is binned into
# quartile-of-[0,1] labels (each label is the bin's upper bound as a string)
# and the predictors are stacked into predictor/value pairs.
# NOTE(review): the trailing comma inside case_when() relies on rlang's
# list2() accepting trailing commas; the bare `response` inside select()
# resolves to the external character variable via tidyselect (deprecated in
# newer tidyselect -- consider all_of()).
if (byRegion == FALSE) {
df_sample_plots <-  df_sample %>% 
  rename(response = all_of(response)) %>% 
  mutate(response = case_when(
    response <= .25 ~ ".25", 
    response > .25 & response <=.5 ~ ".5", 
    response > .5 & response <=.75 ~ ".75", 
    response >= .75  ~ "1", 
  )) %>% 
  select(c(response, pred_vars)) %>% 
  pivot_longer(cols = names(pred_vars), 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value)) 
} else {
  # same reshaping, but predictors are given short display names and the
  # region column is kept so plots can facet by region
  df_sample_plots <-  df_sample %>% 
  select(c(response, pred_vars, newRegion))  %>% 
  rename(response = all_of(response), 
         swe = swe_meanAnnAvg_30yr, 
         MAT = tmean_meanAnnAvg_30yr,
         MAP = prcp_meanAnnTotal_30yr,
         PrecipTempCorr = PrecipTempCorr_meanAnnAvg_30yr, 
         isothermality = isothermality_meanAnnAvg_30yr, 
         wetDegDays = annWetDegDays_meanAnnAvg_30yr
         ) %>% 
  mutate(response = case_when(
    response <= .25 ~ ".25", 
    response > .25 & response <=.5 ~ ".5", 
    response > .5 & response <=.75 ~ ".75", 
    response >= .75  ~ "1", 
  ))  %>% 
  pivot_longer(cols = c(swe, MAT, MAP, PrecipTempCorr, isothermality, wetDegDays), 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value)) 
}

# df_biome_long <- 
#   # for some reason select() was giving me problems
#   # adding numYrs here so can take weighted average
#   predvars2long(modDat_1, response_vars = c(vars_response), 
#                 pred_vars = pred_vars) #%>% 
#    # mutate(across(all_of(vars_nfire), factor))


# Boxplots of each predictor against the binned response.
# fix: aes_string() is deprecated (ggplot2 >= 3.0); the column names here
# are literal, so plain aes() is equivalent.
if (byRegion == FALSE) {
  ggplot(df_sample_plots, aes(x = response, y = value)) +
  geom_boxplot() +
  facet_wrap(~predictor, scales = 'free_y') + 
  ylab("Predictor Variable Values")
} else {
  # facet by predictor x region when fitting regional models
  ggplot(df_sample_plots, aes(x = response, y = value)) +
  geom_boxplot() +
  facet_wrap(~predictor + newRegion , scales = 'free_y') + 
  ylab("Predictor Variable Values") 
}

GLMs

fitting models

fitting various transforms

Creating formulas where each variable on its own is transformed numerous ways (including formula to fit spline), all other variables are left alone, that repeated for each variable. So have models with 1 variable transformed, 2 transformed, etc.

see documentation for glms_iterate_transforms, in the modelling_functions.R script

set.seed(1234)

# adding an interaction term to help deal with over-predicting fire probability
# at pixels with high afgAGB and high MAP. parentheses around interaction
# term are included so that glms_iterate_transforms doesn't transform
# the interaction term. 
#pred_vars_inter <- c(pred_vars, params$inter)


# pred_vars_inter <- pred_vars
# Stepwise search over transformations of each predictor (fit on the
# full-CONUS training sample); see glmTransformsIterates.R for details.
# Steps continue while AIC improves by more than delta_aic.
mods_glm_transf_temp <- glmTransformsIterates(
  preds = pred_vars, 
  df = df_sample,
  response_var = response, 
  delta_aic = 10)
## [1] "step1"
## 
##  -78041.42 
## -78735.71 
## delta aic cutoff 10 
## [1] "step2"
## 
##  -78735.71 
## -79359.01 
## delta aic cutoff 10 
## [1] "step3"
## 
##  -79359.01 
## -79791.57 
## delta aic cutoff 10 
## [1] "step4"
## 
##  -79791.57 
## -80171.32 
## delta aic cutoff 10 
## [1] "step5"
## 
##  -80171.32 
## -80441.36 
## delta aic cutoff 10 
## [1] "step6"
## 
##  -80441.36 
## -80670.43 
## delta aic cutoff 10
# Drop the final list element (final_formula), keeping only the per-step
# model fits.
mods_glm_transf <- head(mods_glm_transf_temp, -1)

# Table: the 5 lowest-AIC transformations found at each step.
kable(map_dfc(mods_glm_transf, ~ names(.$aic[1:5])),
      caption = "5 best transformations at each step")
5 best transformations at each step
step1 step2 step3 step4 step5 step6
convert_poly2log10_swe_meanAnnAvg_30yr convert_poly2_annWetDegDays_meanAnnAvg_30yr convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr convert_poly2_prcp_meanAnnTotal_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2_tmean_meanAnnAvg_30yr
convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2sqrt_annWetDegDays_meanAnnAvg_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2sqrt_prcp_meanAnnTotal_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr
convert_poly2_prcp_meanAnnTotal_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_log10_PrecipTempCorr_meanAnnAvg_30yr convert_log10_prcp_meanAnnTotal_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_poly2log10_tmean_meanAnnAvg_30yr
convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_poly2log10_prcp_meanAnnTotal_30yr convert_poly2log10_tmean_meanAnnAvg_30yr convert_log10_tmean_meanAnnAvg_30yr
convert_poly2sqrt_swe_meanAnnAvg_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_log10_tmean_meanAnnAvg_30yr convert_sqrt_tmean_meanAnnAvg_30yr
# Best (lowest) AIC achieved at each transformation step.
best_aic_by_step <- map_dbl(mods_glm_transf, ~.$aic[1])

# Prepend the untransformed model's AIC as the step-0 reference.
best_aic_by_step <- c(step0 = mods_glm_transf_temp$step1$aic[['convert_none']], 
                      best_aic_by_step)
# AIC improvement by step
diff(best_aic_by_step)
plot(y = best_aic_by_step, 
     x = seq_along(best_aic_by_step) - 1, # fix: seq_along() instead of 1:length()
     ylab = "AIC",
     xlab = "Number of variables transformed")

if (byRegion == TRUE) {
  # Run the iterative transform search for one region's training sample and
  # report the results. Fixes vs original: (1) the same ~20 lines were
  # repeated verbatim for each of the three regions; (2) the kable() table
  # and diff() vector were evaluated inside the `if` block and therefore
  # never printed (knitr only auto-prints top-level results) -- they are now
  # printed explicitly; (3) seq_along() replaces 1:length().
  summarise_transforms <- function(sample_df) {
    temp <- glmTransformsIterates(
      preds = pred_vars,
      df = sample_df,
      response_var = response,
      delta_aic = 10)

    # all elements except the last (final_formula) are per-step fits
    mods <- temp[-length(temp)]

    print(kable(map_dfc(mods, ~ names(.$aic[1:5])),
                caption = "5 best transformations at each step"))

    # best AIC at each step, with the untransformed model as step0
    best <- c(step0 = temp$step1$aic[['convert_none']],
              map_dbl(mods, ~ .$aic[1]))
    print(diff(best)) # AIC improvement by step
    plot(y = best,
         x = seq_along(best) - 1,
         ylab = "AIC",
         xlab = "Number of variables transformed")

    list(temp = temp, mods = mods, best = best)
  }

  ## for western forests
  res_WF <- summarise_transforms(wf_sample)
  mods_glm_transf_WF_temp <- res_WF$temp
  mods_glm_transf_WF <- res_WF$mods
  best_aic_by_step_WF <- res_WF$best

  ## for eastern forests
  res_EF <- summarise_transforms(ef_sample)
  mods_glm_transf_EF_temp <- res_EF$temp
  mods_glm_transf_EF <- res_EF$mods
  best_aic_by_step_EF <- res_EF$best

  ## for grasslands/shrubs
  res_G <- summarise_transforms(g_sample)
  mods_glm_transf_G_temp <- res_G$temp
  mods_glm_transf_G <- res_G$mods
  best_aic_by_step_G <- res_G$best
}
## [1] "step1"
## 
##  -100137.9 
## -100548.2 
## delta aic cutoff 10 
## [1] "step2"
## 
##  -100548.2 
## -100865.6 
## delta aic cutoff 10 
## [1] "step3"
## 
##  -100865.6 
## -100913.4 
## delta aic cutoff 10 
## [1] "step4"
## 
##  -100913.4 
## -100955.9 
## delta aic cutoff 10 
## [1] "step5"
## 
##  -100955.9 
## -100989 
## delta aic cutoff 10 
## [1] "step6"
## 
##  -100989 
## -101020.7 
## delta aic cutoff 10

## [1] "step1"
## 
##  -35202.11 
## -35791.82 
## delta aic cutoff 10 
## [1] "step2"
## 
##  -35791.82 
## -36210.56 
## delta aic cutoff 10 
## [1] "step3"
## 
##  -36210.56 
## -36266.4 
## delta aic cutoff 10 
## [1] "step4"
## 
##  -36266.4 
## -36314.62 
## delta aic cutoff 10 
## [1] "step5"
## 
##  -36314.62 
## -36331.8 
## delta aic cutoff 10 
## [1] "step6"
## 
##  -36331.8 
## -36591.51 
## delta aic cutoff 10

## [1] "step1"
## 
##  -102156.5 
## -103430.2 
## delta aic cutoff 10 
## [1] "step2"
## 
##  -103430.2 
## -103724.5 
## delta aic cutoff 10 
## [1] "step3"
## 
##  -103724.5 
## -104043.8 
## delta aic cutoff 10 
## [1] "step4"
## 
##  -104043.8 
## -104407.2 
## delta aic cutoff 10 
## [1] "step5"
## 
##  -104407.2 
## -104425.3 
## delta aic cutoff 10 
## [1] "step6"
## 
##  -104425.3 
## -104433.6 
## delta aic cutoff 10

Best transformation at each step.

# Extract the final (best) model formula found by the transform search for
# the full dataset, and for each region when fitting regional models.
bestTransformed <- mods_glm_transf_temp$final_formula
if (byRegion == TRUE) {
  # for western forests
  #var_transformed_WF <- map(mods_glm_transf_WF, function(x) x$var_transformed)
bestTransformed_WF <- mods_glm_transf_WF_temp$final_formula

# for eastern forests
#var_transformed_EF <- map(mods_glm_transf_EF, function(x) x$var_transformed)
bestTransformed_EF <- mods_glm_transf_EF_temp$final_formula

# for grass/shrub
#var_transformed_G <- map(mods_glm_transf_G, function(x) x$var_transformed)
bestTransformed_G <- mods_glm_transf_G_temp$final_formula
}

adding interaction terms

Plot potential interaction terms using raw data

across values of swe

# calculate quantiles of 30-yr mean SWE, used to bin observations below
swe_quants <- quantile(df_sample$swe_meanAnnAvg_30yr, na.rm = TRUE)
# longformat dataframe for making plots of each predictor vs the response,
# colored by SWE quartile (to eyeball candidate interaction terms).
# fix: the original wrapped each predictor in
# as.numeric(stats::poly(x, 2, raw = TRUE)[, 1]); the first raw polynomial
# column is just x itself (and poly() errors on any NA), so the transformed
# columns are computed directly -- same values, simpler and NA-tolerant.
df_sample_swe <-  df_sample %>% 
  select(c(response, pred_vars)) %>% 
  mutate(isotherm = isothermality_meanAnnAvg_30yr,
         wetDegDays = annWetDegDays_meanAnnAvg_30yr,
         swe_log = log10(swe_meanAnnAvg_30yr + 1),
         tmean_log = log10(tmean_meanAnnAvg_30yr + 1),
         prcp_log = log10(prcp_meanAnnTotal_30yr + 1),
         prcpTempCorr = PrecipTempCorr_meanAnnAvg_30yr
         ) %>% 
  rename(response = all_of(response)) %>% 
  # bin SWE into quartiles; each observation is labeled with its bin's
  # upper bound
  mutate(swe_meanAnnAvg_30yr_quants = case_when(
    swe_meanAnnAvg_30yr <= swe_quants[2] ~ swe_quants[2], 
    swe_meanAnnAvg_30yr > swe_quants[2]  & swe_meanAnnAvg_30yr <= swe_quants[3]  ~ swe_quants[3], 
    swe_meanAnnAvg_30yr > swe_quants[3]  & swe_meanAnnAvg_30yr <= swe_quants[4]  ~ swe_quants[4], 
    swe_meanAnnAvg_30yr >= swe_quants[4]  ~ swe_quants[5], 
  )) %>% 
  # columns 2:13 are the six raw predictors plus the six derived columns
  pivot_longer(cols = 2:13, 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))

# plot relationships amongst variables according to different levels of SWE 
ggplot(data = df_sample_swe[!(df_sample_swe$predictor %in% c("swe_log","swe_meanAnnAvg_30yr")),]) + 
  geom_smooth(aes(y = response, x = value,  col = as.factor(swe_meanAnnAvg_30yr_quants)), method = "gam") + 
  facet_wrap(~predictor, scales = "free_x")

Mean Annual Temperature

# quartiles of 30-yr mean annual temperature, used to bin observations below
tmean_quants <- quantile(df_sample$tmean_meanAnnAvg_30yr, na.rm = TRUE)

# longformat dataframe for making plots of each predictor vs the response,
# colored by tmean quartile (to eyeball candidate interaction terms).
# fix: the original wrapped each predictor in
# as.numeric(stats::poly(x, 2, raw = TRUE)[, 1]); the first raw polynomial
# column is just x itself (and poly() errors on any NA), so the transformed
# columns are computed directly -- same values, simpler and NA-tolerant.
df_sample_tmean <-  df_sample %>% 
  select(c(response, pred_vars)) %>% 
  mutate(isotherm = isothermality_meanAnnAvg_30yr,
         wetDegDays = annWetDegDays_meanAnnAvg_30yr,
         swe_log = log10(swe_meanAnnAvg_30yr + 1),
         tmean_log = log10(tmean_meanAnnAvg_30yr + 1),
         prcp_log = log10(prcp_meanAnnTotal_30yr + 1),
         prcpTempCorr = PrecipTempCorr_meanAnnAvg_30yr
         ) %>% 
  rename(response = all_of(response)) %>% 
  # bin tmean into quartiles; each observation is labeled with its bin's
  # upper bound
  mutate(tmean_meanAnnAvg_30yr_quants = case_when(
    tmean_meanAnnAvg_30yr <= tmean_quants[2] ~ tmean_quants[2], 
    tmean_meanAnnAvg_30yr > tmean_quants[2]  & tmean_meanAnnAvg_30yr <= tmean_quants[3]  ~ tmean_quants[3], 
    tmean_meanAnnAvg_30yr > tmean_quants[3]  & tmean_meanAnnAvg_30yr <= tmean_quants[4]  ~ tmean_quants[4], 
    tmean_meanAnnAvg_30yr >= tmean_quants[4]  ~ tmean_quants[5], 
  )) %>% 
  # columns 2:13 are the six raw predictors plus the six derived columns
  pivot_longer(cols = 2:13, 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))

# plot relationships amongst variables according to different levels of tmean 
ggplot(data = df_sample_tmean[!(df_sample_tmean$predictor %in% c("tmean_log", "tmean_meanAnnAvg_30yr")),]) + 
  geom_smooth(aes(y = response, x = value,  col = as.factor(tmean_meanAnnAvg_30yr_quants)), method = "gam") + 
  facet_wrap(~predictor, scales = "free_x")

Mean annual precipitation

# quartiles of 30-yr mean annual precipitation, used to bin observations below
prcp_quants <- quantile(df_sample$prcp_meanAnnTotal_30yr, na.rm = TRUE)

# longformat dataframe for making plots of each predictor vs the response,
# colored by precipitation quartile (to eyeball candidate interaction terms).
# fix: the original wrapped each predictor in
# as.numeric(stats::poly(x, 2, raw = TRUE)[, 1]); the first raw polynomial
# column is just x itself (and poly() errors on any NA), so the transformed
# columns are computed directly -- same values, simpler and NA-tolerant.
df_sample_prcp <-  df_sample %>% 
  select(c(response, pred_vars)) %>% 
  mutate(isotherm = isothermality_meanAnnAvg_30yr,
         wetDegDays = annWetDegDays_meanAnnAvg_30yr,
         swe_log = log10(swe_meanAnnAvg_30yr + 1),
         tmean_log = log10(tmean_meanAnnAvg_30yr + 1),
         prcp_log = log10(prcp_meanAnnTotal_30yr + 1),
         prcpTempCorr = PrecipTempCorr_meanAnnAvg_30yr
         ) %>% 
  rename(response = all_of(response)) %>% 
  # bin precipitation into quartiles; each observation is labeled with its
  # bin's upper bound
  mutate(prcp_meanAnnTotal_30yr_quants = case_when(
    prcp_meanAnnTotal_30yr <= prcp_quants[2] ~ prcp_quants[2], 
    prcp_meanAnnTotal_30yr > prcp_quants[2]  & prcp_meanAnnTotal_30yr <= prcp_quants[3]  ~ prcp_quants[3], 
    prcp_meanAnnTotal_30yr > prcp_quants[3]  & prcp_meanAnnTotal_30yr <= prcp_quants[4]  ~ prcp_quants[4], 
    prcp_meanAnnTotal_30yr >= prcp_quants[4]  ~ prcp_quants[5], 
  )) %>% 
  # columns 2:13 are the six raw predictors plus the six derived columns
  pivot_longer(cols = 2:13, 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))

# plot relationships amongst variables according to different levels of prcp 
ggplot(data = df_sample_prcp[!(df_sample_prcp$predictor %in% c("prcp_log", "prcp_meanAnnTotal_30yr")),]) + 
  geom_smooth(aes(y = response, x = value,  col = as.factor(prcp_meanAnnTotal_30yr_quants)), method = "gam") + 
  facet_wrap(~predictor, scales = "free_x")

we’ll try running the model w/ all possible interactions

  ## make table of possible interactions
# split the best formula's RHS into its additive terms, one per column
var_transformed_df <- as.data.frame(str_remove(bestTransformed, paste0(response, " ~ ")) %>% 
  str_split(pattern = fixed(" + "), simplify = TRUE))#purrr::flatten_dfr(var_transformed_WF) #%>%


# if has a "stats::poly( to start, remove that part (keeping the inner
# variable, e.g. "log10(I(x+1))"); var_transformed_df has one row, so each
# column passed in is a single string
temp <- apply(var_transformed_df, MARGIN = 2, FUN = function(x) {
if(str_detect(x, pattern = fixed("stats::poly("))) {
  x_new <- str_remove(x, pattern = fixed("stats::poly(")) %>%
    str_remove_all(pattern = " ") %>% 
  str_remove(pattern = fixed(",2,raw=TRUE)")) 
} else {
  x_new <- x
}
})
  
# all ordered pairs of terms (n*(n-1); 30 for the 6 predictors here).
# NOTE(review): permutations() yields both a:b and b:a -- terms() collapses
# the duplicates when the model is fit, but gtools::combinations() would
# avoid generating them in the first place.
combos <- data.frame(gtools::permutations(n = length(temp),
r = 2,
v = temp))

combos$interactions <- paste0(combos$X1, ":", combos$X2)
# make text string of interactions
int_string <-str_flatten(combos$interactions, collapse = " + ")
# formula w/ all pairwise interactions appended; "stats::" is stripped so
# the formula parses with plain poly()
modWithInteractions <- paste(bestTransformed, "+",int_string) %>% 
str_remove_all(fixed("stats::"))
# refitting the beta regression with the interaction formula on the
# full-dataset training sample
mod_glm1 <- betareg(as.formula(modWithInteractions), data = df_sample, link = c("logit"), link.phi = NULL,
 type = c("ML"))                   
 

if (byRegion == TRUE) {
  # Build a beta regression with all pairwise interactions appended to a
  # region's best transformed formula (the same ~30 lines were previously
  # repeated verbatim for each of the three regions). Returns every
  # intermediate piece so the per-region global names the original code
  # created can be preserved for downstream chunks.
  build_interaction_fit <- function(best_formula, sample_df) {
    # split the formula RHS into its additive terms, one per column
    terms_df <- as.data.frame(
      str_remove(best_formula, paste0(response, " ~ ")) %>%
        str_split(pattern = fixed(" + "), simplify = TRUE))

    # strip the stats::poly() wrapper, keeping just the inner variable
    base_terms <- apply(terms_df, MARGIN = 2, FUN = function(x) {
      if (str_detect(x, pattern = fixed("stats::poly("))) {
        str_remove(x, pattern = fixed("stats::poly(")) %>%
          str_remove_all(" ") %>%
          str_remove(pattern = fixed(",2,raw=TRUE)"))
      } else {
        x
      }
    })

    # all ordered pairs of terms (n*(n-1) of them).
    # NOTE(review): this creates both a:b and b:a -- terms() de-duplicates
    # them at fit time, but gtools::combinations() would avoid the redundancy.
    combos <- data.frame(gtools::permutations(n = length(base_terms),
                                              r = 2,
                                              v = base_terms))
    combos$interactions <- paste0(combos$X1, ":", combos$X2)
    int_string <- str_flatten(combos$interactions, collapse = " + ")

    # full formula with interactions; "stats::" stripped so it parses with
    # plain poly()
    form <- paste(best_formula, "+", int_string) %>%
      str_remove_all(fixed("stats::"))

    # refit the beta regression with the interaction formula
    mod <- betareg(as.formula(form), data = sample_df,
                   link = c("logit"), link.phi = NULL, type = c("ML"))

    list(terms_df = terms_df, base_terms = base_terms, combos = combos,
         int_string = int_string, formula = form, mod = mod)
  }

  # western forests
  fit_WF <- build_interaction_fit(bestTransformed_WF, wf_sample)
  var_transformed_df_WF <- fit_WF$terms_df
  temp_WF <- fit_WF$base_terms
  combos_WF <- fit_WF$combos
  int_string_WF <- fit_WF$int_string
  modWithInteractions_WF <- fit_WF$formula
  mod_glm1_WF <- fit_WF$mod

  # eastern forests
  fit_EF <- build_interaction_fit(bestTransformed_EF, ef_sample)
  var_transformed_df_EF <- fit_EF$terms_df
  temp_EF <- fit_EF$base_terms
  combos_EF <- fit_EF$combos
  int_string_EF <- fit_EF$int_string
  modWithInteractions_EF <- fit_EF$formula
  mod_glm1_EF <- fit_EF$mod

  ## grassland/shrubs
  fit_G <- build_interaction_fit(bestTransformed_G, g_sample)
  var_transformed_df_G <- fit_G$terms_df
  temp_G <- fit_G$base_terms
  combos_G <- fit_G$combos
  int_string_G <- fit_G$int_string
  modWithInteractions_G <- fit_G$formula
  mod_glm1_G <- fit_G$mod

}

Performing backward AIC-based model selection

Using the “StepBeta” function from the StepBeta R package

modSelection <- StepBeta_mine(mod_glm1, data = df_sample)
## [1] "100 % of the process"
# backward AIC-based selection for each regional model
# NOTE(review): the echoed output below shows a chol() error during one of
# these runs ("leading minor of order 4 is not positive") -- confirm the
# affected region's selection still returned a usable model
if (byRegion == TRUE) {
  modSelection_WF <- StepBeta_mine(mod_glm1_WF, data = wf_sample)
  modSelection_EF <- StepBeta_mine(mod_glm1_EF, data = ef_sample)
  modSelection_G <- StepBeta_mine(mod_glm1_G, data = g_sample)
}
## [1] "100 % of the process"
## [1] "100 % of the process"
## Error in chol.default(K) : the leading minor of order 4 is not positive
## [1] "100 % of the process"

same model for all response variables

The response variable is percent cover expressed as a decimal proportion (here HerbCover_dec). Using the best model formula selected in the previous step

 # formula of the best (lowest-AIC) model chosen by stepwise selection
 best_form <- modSelection$formula
print(best_form)
## HerbCover_dec ~ poly(tmean_meanAnnAvg_30yr, 2, raw = TRUE) + 
##     poly(I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))), 2, 
##         raw = TRUE) + poly(I(log10(I(swe_meanAnnAvg_30yr + 1))), 
##     2, raw = TRUE) + poly(annWetDegDays_meanAnnAvg_30yr, 2, raw = TRUE) + 
##     poly(prcp_meanAnnTotal_30yr, 2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr, 
##     2, raw = TRUE) + I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##     1))):prcp_meanAnnTotal_30yr + annWetDegDays_meanAnnAvg_30yr:I(log10(I(swe_meanAnnAvg_30yr + 
##     1))) + I(log10(I(swe_meanAnnAvg_30yr + 1))):tmean_meanAnnAvg_30yr + 
##     I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))):I(log10(I(swe_meanAnnAvg_30yr + 
##         1))) + I(log10(I(swe_meanAnnAvg_30yr + 1))):prcp_meanAnnTotal_30yr + 
##     I(log10(I(swe_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr + 
##     prcp_meanAnnTotal_30yr:tmean_meanAnnAvg_30yr + I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##     1))):tmean_meanAnnAvg_30yr + annWetDegDays_meanAnnAvg_30yr:isothermality_meanAnnAvg_30yr + 
##     isothermality_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr + I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##     1))):isothermality_meanAnnAvg_30yr + annWetDegDays_meanAnnAvg_30yr:tmean_meanAnnAvg_30yr + 
##     annWetDegDays_meanAnnAvg_30yr:I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##         1))) + annWetDegDays_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr
## <environment: 0x3b229e390>
# Refit the beta regression using the AIC-selected formula; this should
# reproduce exactly the model that StepBeta settled on.
mod_glmFinal <- betareg(
  as.formula(best_form),
  data = df_sample,
  link = "logit",
  link.phi = NULL,
  type = "ML"
)

# Sanity check: AIC should equal the value reported during selection
AIC(mod_glmFinal)
## [1] -87313.39
# Refit each regional beta regression with its AIC-selected formula.
# The final AIC() call is the last expression of the if-block, so its
# value auto-prints (matching the echoed output below).
if (byRegion == TRUE) {
  ## western forests
  best_form_WF <- modSelection_WF$formula
  print(best_form_WF)
  mod_glmFinal_WF <- betareg(as.formula(best_form_WF), data = wf_sample,
                             link = "logit", link.phi = NULL, type = "ML")
  # should be the same AIC (i.e. refitting the same model)
  AIC(mod_glmFinal_WF)

  ## eastern forests
  best_form_EF <- modSelection_EF$formula
  print(best_form_EF)
  mod_glmFinal_EF <- betareg(as.formula(best_form_EF), data = ef_sample,
                             link = "logit", link.phi = NULL, type = "ML")
  AIC(mod_glmFinal_EF)

  ## grass/shrub
  best_form_G <- modSelection_G$formula
  print(best_form_G)
  mod_glmFinal_G <- betareg(as.formula(best_form_G), data = g_sample,
                            link = "logit", link.phi = NULL, type = "ML")
  AIC(mod_glmFinal_G)
}
## HerbCover_dec ~ poly(tmean_meanAnnAvg_30yr, 2, raw = TRUE) + 
##     poly(I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))), 2, raw = TRUE) + 
##     poly(I(log10(I(swe_meanAnnAvg_30yr + 1))), 2, raw = TRUE) + 
##     poly(isothermality_meanAnnAvg_30yr, 2, raw = TRUE) + poly(PrecipTempCorr_meanAnnAvg_30yr, 
##     2, raw = TRUE) + poly(I(log10(I(prcp_meanAnnTotal_30yr + 
##     1))), 2, raw = TRUE) + I(log10(I(annWetDegDays_meanAnnAvg_30yr + 
##     1))):I(log10(I(prcp_meanAnnTotal_30yr + 1))) + I(log10(I(swe_meanAnnAvg_30yr + 
##     1))):tmean_meanAnnAvg_30yr + I(log10(I(prcp_meanAnnTotal_30yr + 
##     1))):I(log10(I(swe_meanAnnAvg_30yr + 1))) + I(log10(I(prcp_meanAnnTotal_30yr + 
##     1))):tmean_meanAnnAvg_30yr + I(log10(I(prcp_meanAnnTotal_30yr + 
##     1))):isothermality_meanAnnAvg_30yr + isothermality_meanAnnAvg_30yr:tmean_meanAnnAvg_30yr + 
##     isothermality_meanAnnAvg_30yr:PrecipTempCorr_meanAnnAvg_30yr + 
##     I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))):tmean_meanAnnAvg_30yr + 
##     I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr + 
##     I(log10(I(swe_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr + 
##     I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))):I(log10(I(swe_meanAnnAvg_30yr + 
##         1))) + I(log10(I(swe_meanAnnAvg_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr + 
##     I(log10(I(prcp_meanAnnTotal_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr + 
##     PrecipTempCorr_meanAnnAvg_30yr:tmean_meanAnnAvg_30yr + I(log10(I(annWetDegDays_meanAnnAvg_30yr + 
##     1))):PrecipTempCorr_meanAnnAvg_30yr
## <environment: 0x33d3cb9b0>
## HerbCover_dec ~ poly(I(log10(I(annWetDegDays_meanAnnAvg_30yr + 
##     1))), 2, raw = TRUE) + poly(I(sqrt(prcp_meanAnnTotal_30yr)), 
##     2, raw = TRUE) + poly(I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##     1))), 2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr, 
##     2, raw = TRUE) + poly(tmean_meanAnnAvg_30yr, 2, raw = TRUE) + 
##     poly(swe_meanAnnAvg_30yr, 2, raw = TRUE) + isothermality_meanAnnAvg_30yr:tmean_meanAnnAvg_30yr + 
##     I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr + 
##     I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))):I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##         1))) + I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))):swe_meanAnnAvg_30yr + 
##     swe_meanAnnAvg_30yr:tmean_meanAnnAvg_30yr + I(log10(I(annWetDegDays_meanAnnAvg_30yr + 
##     1))):swe_meanAnnAvg_30yr + I(sqrt(prcp_meanAnnTotal_30yr)):isothermality_meanAnnAvg_30yr + 
##     I(sqrt(prcp_meanAnnTotal_30yr)):tmean_meanAnnAvg_30yr + isothermality_meanAnnAvg_30yr:swe_meanAnnAvg_30yr + 
##     I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))):tmean_meanAnnAvg_30yr + 
##     I(log10(I(annWetDegDays_meanAnnAvg_30yr + 1))):I(sqrt(prcp_meanAnnTotal_30yr)) + 
##     I(sqrt(prcp_meanAnnTotal_30yr)):swe_meanAnnAvg_30yr + I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 
##     1))):I(sqrt(prcp_meanAnnTotal_30yr))
## <environment: 0x33d47d898>
## HerbCover_dec ~ poly(prcp_meanAnnTotal_30yr, 2, raw = TRUE) + 
##     poly(I(log10(I(tmean_meanAnnAvg_30yr + 1))), 2, raw = TRUE) + 
##     annWetDegDays_meanAnnAvg_30yr + poly(PrecipTempCorr_meanAnnAvg_30yr, 
##     2, raw = TRUE) + poly(I(log10(I(swe_meanAnnAvg_30yr + 1))), 
##     2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr, 2, raw = TRUE) + 
##     prcp_meanAnnTotal_30yr:PrecipTempCorr_meanAnnAvg_30yr + I(log10(I(swe_meanAnnAvg_30yr + 
##     1))):prcp_meanAnnTotal_30yr + I(log10(I(swe_meanAnnAvg_30yr + 
##     1))):isothermality_meanAnnAvg_30yr + I(log10(I(tmean_meanAnnAvg_30yr + 
##     1))):prcp_meanAnnTotal_30yr + I(log10(I(swe_meanAnnAvg_30yr + 
##     1))):I(log10(I(tmean_meanAnnAvg_30yr + 1))) + isothermality_meanAnnAvg_30yr:PrecipTempCorr_meanAnnAvg_30yr + 
##     I(log10(I(tmean_meanAnnAvg_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr + 
##     annWetDegDays_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr + isothermality_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr + 
##     I(log10(I(tmean_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr + 
##     annWetDegDays_meanAnnAvg_30yr:I(log10(I(swe_meanAnnAvg_30yr + 
##         1))) + annWetDegDays_meanAnnAvg_30yr:I(log10(I(tmean_meanAnnAvg_30yr + 
##     1))) + I(log10(I(swe_meanAnnAvg_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr
## <environment: 0x3b8a24940>
## [1] -106134.9

partial dependence & VIP

PDP plot trend made using a small sample of the data

#vip::vip(mod_glmFinal, num_features = 15)

#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)

#caret::varImp(mod_glmFinal)

observed vs. predicted

Predicting on the data

  # Add a prediction column for a fitted model.
  #
  # mod:  fitted model with a predict() method (here: a betareg fit)
  # df:   data frame to predict on
  # resp: name of the response variable; defaults to the globally defined
  #       `response` so existing calls predict_by_response(mod, df) behave
  #       exactly as before, while allowing explicit reuse for other responses
  #
  # Returns `df` with an added "<resp>_pred" column of response-scale
  # predictions.
predict_by_response <- function(mod, df, resp = response) {
  df_out <- df

  response_name <- paste0(resp, "_pred")
  df_out[[response_name]] <- predict(mod, df, type = 'response')
  df_out
}

pred_glm1 <- predict_by_response(mod_glmFinal, df_test)


if (byRegion == TRUE) {
  # predictions from each regional model on its own test set
  pred_glm1_WF <- predict_by_response(mod_glmFinal_WF, wf_test)  # western forests
  pred_glm1_EF <- predict_by_response(mod_glmFinal_EF, ef_test)  # eastern forests
  pred_glm1_G <- predict_by_response(mod_glmFinal_G, g_test)     # grass/shrub

  # stack the regional predictions for combined figures later on
  pred_glm1_ALL <- rbind(pred_glm1_WF, pred_glm1_EF, pred_glm1_G)
}

Maps of Residuals

For CONUS-wide model

# Residuals (observed - predicted) for the CONUS-wide model.
pred_glm1 <- pred_glm1 %>%
  mutate(resid = .data[[response]] - .data[[paste0(response, "_pred")]])

# Reference raster used as the rasterization template: an aggregated
# DayMet monthly-precipitation grid.
test_rast <- rast("../../../data/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
  terra::aggregate(fact = 32, fun = "mean")

# Rasterize the mean residual per grid cell, then crop to CONUS.
plotResid_rast <- pred_glm1 %>%
  drop_na(resid) %>%
  terra::vect(geom = c("Lon", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) %>%
  terra::crop(ext(-2000000, 2500000, -2000000, 1200000))

# Map of residuals.
ggplot() +
  geom_spatraster(data = plotResid_rast) +
  ggtitle(paste0("Resids. from Best CONUS wide model of ",s)) +
  scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")

If applicable, residual plots of region-level models

# Residual map for the stacked regional predictions (pred_glm1_ALL).
# Mirrors the CONUS-wide residual map above: same reference raster
# (test_rast), same crop extent, and same color limits so the two maps
# are directly comparable.
if (byRegion == TRUE){
# observed - predicted residuals
pred_glm1_ALL <- pred_glm1_ALL %>% 
  mutate(resid = .[[response]] - .[[paste0(response,"_pred")]]) 

# rasterize

# rasterize data (mean residual per grid cell)
plotResid_rast_ALL <- pred_glm1_ALL %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Lon", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) %>% 
  terra::crop(ext(-2000000, 2500000, -2000000, 1200000))
# plot (last expression of the if-block, so it auto-prints)
ggplot() + 
  geom_spatraster(data = plotResid_rast_ALL) + 
  ggtitle(paste0("Resids. from regional models of ",s)) + 
  scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")
}

# Per-region residual maps, each cropped to its region's extent and
# stacked with patchwork at the end (the last expression auto-prints).
# BUG FIX: the grass/shrub plot title wrongly said
# "eastern grass/shrubland-wide model" -- this is the (CONUS-wide)
# grass/shrub region, not an eastern one (compare the WF/EF titles).
if (byRegion == TRUE){
## for western forests
  pred_glm1_WF <- pred_glm1_WF %>% 
  mutate(resid = .[[response]] - .[[paste0(response,"_pred")]]) 

# rasterize

# rasterize data
plotResid_rast_WF <- pred_glm1_WF %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Lon", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) %>% 
  terra::crop(ext(-2000000, 0, -1500000, 1200000))
westForestRast <- ggplot() + 
  geom_spatraster(data = plotResid_rast_WF) + 
  ggtitle(paste0("Resids. from western forest-wide model of ",s)) + 
  scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white") 

## for eastern forests
  pred_glm1_EF <- pred_glm1_EF %>% 
  mutate(resid = .[[response]] - .[[paste0(response,"_pred")]]) 

# rasterize data
plotResid_rast_EF <- pred_glm1_EF %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Lon", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) %>% 
  terra::crop(ext(0, 2500000, -1700000, 1200000))

EastForestRast <- ggplot() + 
  geom_spatraster(data = plotResid_rast_EF) + 
  ggtitle(paste0("Resids. from eastern forest-wide model of ",s)) + 
  scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white") 

## for shrub/grassland
  pred_glm1_G <- pred_glm1_G %>% 
  mutate(resid = .[[response]] - .[[paste0(response,"_pred")]]) 

# rasterize data
plotResid_rast_G <- pred_glm1_G %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Lon", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) %>% 
  terra::crop(ext(-2000000, 900000, -1900000, 1000000))
grassRast <- ggplot() + 
  geom_spatraster(data = plotResid_rast_G) + 
  ggtitle(paste0("Resids. from grass/shrubland-wide model of ",s)) + 
  scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white") 

westForestRast / EastForestRast / grassRast
}

Deciles

Binning predictor variables into deciles (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word deciles is just a legacy thing (they started out being actual deciles)

Then predicting on an identical dataset but with warming

# names of the observed response column and its prediction column
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)

# bin predictors into percentiles ("deciles") for the CONUS-wide model
pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                      pred_vars = pred_vars)

# Percentile binning for each regional model; newRegion labels the
# source region so the figures can distinguish them.
if (byRegion == TRUE) {
  ## western forests
  pred_glm1_deciles_WF <- predvars2deciles(pred_glm1_WF,
                                      response_vars = response_vars,
                                      pred_vars = pred_vars) %>% 
    mutate(newRegion = "westernForest")
  
    ## eastern forests
  pred_glm1_deciles_EF <- predvars2deciles(pred_glm1_EF,
                                      response_vars = response_vars,
                                      pred_vars = pred_vars) %>% 
    mutate(newRegion = "easternForest")
  
    ## grass/shrub
  pred_glm1_deciles_G <- predvars2deciles(pred_glm1_G,
                                      response_vars = response_vars,
                                      pred_vars = pred_vars) %>% 
    mutate(newRegion = "grassShrub")
  
  ## put together for later figures 
  # (binned on the stacked predictions, not by concatenating the
  # per-region decile tables)
  pred_glm1_deciles_ALL <- predvars2deciles(pred_glm1_ALL,
                                      response_vars = response_vars,
                                      pred_vars = pred_vars)
}

Publication quality quantile plot

# Publication-quality decile plot for the CONUS-wide model.
# g3 is the base plot; g4 adds the observed/predicted inset (skipped when
# the human-modification variant of the model is being run).
g3 <- decile_dotplot_pq(pred_glm1_deciles, response = response) + ggtitle("Decile Plot for CONUS-wide model")

if(!hmod) {
# obs/pred inset
g4 <- add_dotplot_inset(g3, pred_glm1_deciles)
} else {
  g4 <- g3
}
if (byRegion == FALSE){
  g4
}

  
if(save_figs) {
  png(paste0("figures/quantile_plots/quantile_plot_v5_CONUS_wideModel_", s,  ".png"), 
     units = "in", res = 600, width = 5.5, height = 3.5 )
    # BUG FIX: was print(g2) -- g2 is the regional figure and is not
    # created until the next chunk; the CONUS-wide file must save g4
    print(g4)
  dev.off()
}
# Publication-quality decile plot for the combined regional models.
# g is the base plot; g2 adds the observed/predicted inset (skipped for
# the human-modification model variant).
if (byRegion == TRUE) {

  # publication quality version
g <- decile_dotplot_pq(pred_glm1_deciles_ALL, response = response) + ggtitle("Decile Plot for ecoregion-level model")

if(!hmod) {
# obs/pred inset
g2 <- add_dotplot_inset(g, pred_glm1_deciles_ALL)
} else {
  g2 <- g
}

if(save_figs) {
  png(paste0("figures/quantile_plots/quantile_plot_v5_regionLevelModel", s,  ".png"), 
     units = "in", res = 600, width = 5.5, height = 3.5 )
    print(g2)
  dev.off()
}


# patchwork: CONUS-wide figure (g4, from the previous chunk) stacked over
# the regional figure; last expression of the if-block, so it auto-prints
g4/g2
}

Deciles Filtered

20th and 80th percentiles for each climate variable

# 20th and 80th percentiles of each predictor in the test predictions
# (used to define the "filtered" decile plots below)
df <- pred_glm1[, pred_vars] #%>% 
  #mutate(MAT = MAT - 273.15) # k to c
map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)
## $swe_meanAnnAvg_30yr
##         20%         80% 
##  0.03454667 19.43416382 
## 
## $tmean_meanAnnAvg_30yr
##      20%      80% 
## 5.686425 7.806529 
## 
## $prcp_meanAnnTotal_30yr
##       20%       80% 
##  315.6531 1238.5913 
## 
## $PrecipTempCorr_meanAnnAvg_30yr
##        20%        80% 
## -0.6686219  0.1851129 
## 
## $isothermality_meanAnnAvg_30yr
##       20%       80% 
## -41.59266 -33.60486 
## 
## $annWetDegDays_meanAnnAvg_30yr
##      20%      80% 
## 1312.369 1898.633

Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each climate variable.

# climate variables shown on the x-axes of the filtered decile plots
# NOTE(review): this list includes "precip_Seasonality_meanAnnAvg_30yr",
# but the model terms and the quantiles printed above use
# "PrecipTempCorr_meanAnnAvg_30yr" -- confirm which variable is intended
clim_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "precip_Seasonality_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
# deciles restricted to rows in the upper/lower two deciles of each
# filtering climate variable
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = pred_vars,
                         filter_var = TRUE,
                         filter_vars = pred_vars) 

decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = clim_vars)

#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)

Filtered quantile figure with the middle 2 deciles also shown (this is very memory intensive, so it is not currently being run)

# Filtered 'decile' plot that additionally shows the middle two deciles.
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = pred_vars,
                         filter_vars = pred_vars,
                         filter_var = TRUE,
                         add_mid = TRUE)

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = pred_vars)
g

if(save_figs) {
  # BUG FIX: removed a stray "x" token after "{" (it would raise
  # "object 'x' not found" whenever save_figs is TRUE), and wrapped g in
  # print() -- ggplot objects do not auto-print inside a block, so the
  # jpeg device would otherwise be closed with an empty page
  jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", s, ".jpeg"),
       units = "in", res = 600, width = 5.5, height = 6 )
  print(g)
  dev.off()
}

Save output

# glm models
# butcher strips large components (fitted values, residuals, etc.) so the
# saved object stays small; formula and training-sample size are re-attached
mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge

mods2save$formula <- best_form
#mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
n <- nrow(df_sample)
mods2save$data_rows <- n

# Trim each regional model with butcher (keeps the saved .RDS small) and
# re-attach the selected formula plus the regional training-sample size.
if (byRegion == TRUE){
  ## western forests
  mods2save_WF <- butcher::butcher(mod_glmFinal_WF)
  mods2save_WF$formula <- best_form_WF
  mods2save_WF$data_rows <- nrow(wf_sample)

  ## eastern forests
  mods2save_EF <- butcher::butcher(mod_glmFinal_EF)
  mods2save_EF$formula <- best_form_EF
  mods2save_EF$data_rows <- nrow(ef_sample)

  ## grass/shrub
  mods2save_G <- butcher::butcher(mod_glmFinal_G)
  mods2save_G$formula <- best_form_G
  mods2save_G$data_rows <- nrow(g_sample)
}

# Write the trimmed model objects to disk (skipped on test runs).
# NOTE(review): all four file names embed `n` = nrow(df_sample), the
# CONUS-wide sample size, even for the regional models (whose true sizes
# are stored in $data_rows) -- confirm this naming is intentional
if(!test_run) {
  saveRDS(mods2save, 
        paste0("./models/glm_beta_model_CONUSwide_", s, "_n", n, 
        #sample_group, 
        ".RDS"))
  if (byRegion == TRUE) {
    ## western forests
     saveRDS(mods2save_WF, 
        paste0("./models/glm_beta_model_WesternForests_", s, "_n", n, 
        #sample_group, 
        ".RDS"))
    ## eastern forests
     saveRDS(mods2save_EF, 
        paste0("./models/glm_beta_model_EasternForests_", s, "_n", n, 
        #sample_group, 
        ".RDS"))
     ## grass/shrub
     saveRDS(mods2save_G, 
        paste0("./models/glm_beta_model_GrassShrub_", s, "_n", n, 
        #sample_group, 
        ".RDS"))
  }
}

session info

Hash of current commit (i.e. to ID the version of the code used)

system("git rev-parse HEAD", intern=TRUE)
## [1] "4cabefa72252f87135d59263bd071d67b047b86e"

Packages etc.

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] StepBeta_2.1.0  ggtext_0.1.2    knitr_1.46      gridExtra_2.3  
##  [5] pdp_0.8.1       GGally_2.2.1    lubridate_1.9.3 forcats_1.0.0  
##  [9] dplyr_1.1.4     purrr_1.0.2     readr_2.1.5     tidyr_1.3.1    
## [13] tibble_3.2.1    tidyverse_2.0.0 betareg_3.1-4   caret_6.0-94   
## [17] lattice_0.22-6  ggplot2_3.5.1   sf_1.0-16       tidyterra_0.6.1
## [21] terra_1.7-78    dtplyr_1.3.1    patchwork_1.2.0 stringr_1.5.1  
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3            pROC_1.18.5          sandwich_3.1-0      
##  [4] rlang_1.1.4          magrittr_2.0.3       e1071_1.7-14        
##  [7] compiler_4.4.0       mgcv_1.9-1           flexmix_2.3-19      
## [10] vctrs_0.6.5          reshape2_1.4.4       combinat_0.0-8      
## [13] pkgconfig_2.0.3      fastmap_1.2.0        labeling_0.4.3      
## [16] utf8_1.2.4           rmarkdown_2.27       markdown_1.13       
## [19] prodlim_2024.06.25   tzdb_0.4.0           xfun_0.44           
## [22] modeltools_0.2-23    cachem_1.1.0         jsonlite_1.8.8      
## [25] recipes_1.1.0        highr_0.10           parallel_4.4.0      
## [28] R6_2.5.1             bslib_0.7.0          stringi_1.8.4       
## [31] RColorBrewer_1.1-3   parallelly_1.37.1    rpart_4.1.23        
## [34] lmtest_0.9-40        jquerylib_0.1.4      Rcpp_1.0.12         
## [37] iterators_1.0.14     future.apply_1.11.2  zoo_1.8-12          
## [40] butcher_0.3.4        Matrix_1.7-0         splines_4.4.0       
## [43] nnet_7.3-19          timechange_0.3.0     tidyselect_1.2.1    
## [46] rstudioapi_0.16.0    yaml_2.3.8           timeDate_4032.109   
## [49] codetools_0.2-20     listenv_0.9.1        plyr_1.8.9          
## [52] withr_3.0.0          evaluate_0.23        future_1.33.2       
## [55] survival_3.5-8       ggstats_0.6.0        units_0.8-5         
## [58] proxy_0.4-27         xml2_1.3.6           pillar_1.9.0        
## [61] KernSmooth_2.23-22   foreach_1.5.2        stats4_4.4.0        
## [64] generics_0.1.3       hms_1.1.3            commonmark_1.9.1    
## [67] aod_1.3.3            munsell_0.5.1        scales_1.3.0        
## [70] gtools_3.9.5         globals_0.16.3       class_7.3-22        
## [73] glue_1.7.0           tools_4.4.0          data.table_1.15.4   
## [76] ModelMetrics_1.2.2.2 gower_1.0.1          grid_4.4.0          
## [79] ipred_0.9-15         colorspace_2.1-0     nlme_3.1-164        
## [82] Formula_1.2-5        cli_3.6.2            fansi_1.0.6         
## [85] viridisLite_0.4.2    lava_1.8.0           gtable_0.3.5        
## [88] sass_0.4.9           digest_0.6.35        classInt_0.4-10     
## [91] farver_2.1.2         htmltools_0.5.8.1    lifecycle_1.0.4     
## [94] hardhat_1.4.0        gridtext_0.1.5       MASS_7.3-60.2